2 Diving In

Back to top.

2.1 Libraries and Source Files

library(tidyverse)
# library(dict) # Still not found after installation
library(container) # For Dict class
library(useful) # For simple.impute
library(comprehenr) # For list comprehension
library(GGally)
library(reshape2)
library(gridExtra)
library(gplots)
library(DescTools) # For df summary
library(robustHD) # For df summary
library(caret)
library(effsize) # For Cohen's d

source('tools/wrangle.R')
source('tools/eda.R')
source('tools/engineer.R')
source('tools/split.R')

# SEED = 65466

2.2 Loading Data

Here’s the wrangled training set loaded from objects serialized at the end of wrangle_and_split.Rmd.

# Load the wrangled training split serialized by wrangle_and_split.Rmd,
# keep the Id column aside for later joins, and drop it from the
# modeling frame.
val_train_Xy = readRDS("data/eda_pt1_val_train_Xy.rds")
id_col = val_train_Xy[["Id"]]
val_train_Xy = val_train_Xy %>% select(-Id)

3 EDA and Engineering

3.1 Recalc Best Transformers

# Candidate normalizing transformations, keyed by display name. The keys
# are part of the downstream interface (best_normalizers[[feat]]$best_func$name
# reports them), so existing names are kept unchanged.
funcs_lst = list(
    'no_func' = function (x) { x },
    'sqrt' = sqrt,
    'cbrt' = function(x) { x^(1/3) },
    'square' = function(x) { x^2 },
    # Plain logs are undefined/-Inf at 0; exclude_vals = list(0) is relied
    # on to keep zeros out of these.
    'log' = log,
    'log2' = log2,
    'log10' = log10,
    # FIXME resolved: log transformations of x + 1, which are defined at
    # x = 0 (log1p is the numerically stable base-R form of log(x + 1)).
    'log(x+1)' = log1p,
    'log2(x+1)' = function (x) { log2(x + 1) },
    'log10(x+1)' = function (x) { log10(x + 1) },
    '1/x' = function (x) { 1/x },
    '2^(1/x)' = function (x) { 2^(1/x) }
    # Box Cox: write function that calls MASS::boxcox and include lambda in results/function returned.
    # Yeo-Johnson
    # Winsorize here?
    # Rank
    # Rank-Gauss
  )
# List the numeric features, then fit the best normalizing transform per
# feature (helper sourced from tools/eda.R).
num_feats = val_train_Xy %>%
  select(where(is.numeric)) %>%
  colnames()
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)  # zeros break log/reciprocal candidates
)

# print("Best normalizing transformations:")
# for (feat in names(best_normalizers)) {
#   func_name = best_normalizers[[feat]]$best_func$name
#   print(
#     paste(
#       feat, ":", func_name,
#       ", p-value:", best_normalizers[[feat]]$results[[func_name]]$p.value
#     )
#   )
# }

3.2 Heating

Back to top.

Mostly gas forced air (GasA, 702). Can probably drop this.

# Heating is nearly constant (GasA dominates); show the counts, then drop it.
summary(val_train_Xy$Heating)
##  None Floor  GasA  GasW  Grav  OthW  Wall 
##     0     0   702     7     2     1     3
val_train_Xy = val_train_Xy %>% select(-Heating)

3.3 HeatingQC

Back to top.

Mostly excellent (360), then interestingly more average (211) than good (117). Enough variance to keep, but not normal.

# Summarize SalePrice (raw, then log) across HeatingQC levels using
# helpers sourced from tools/eda.R.
x = 'HeatingQC'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
# Pairwise level tests; min_n = 29 admits levels with >= 29 rows.
# NOTE(review): other sections use min_n = 30 — confirm this is deliberate.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

# Annotated heatmap of the pairwise p-value matrix (no clustering).
heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Ex" "TA" "Gd"

3.4 CentralAir

Back to top.

676 yes, 39 no. Probably drop this one.

# CentralAir is nearly constant (676 Y vs 39 N); print counts, then drop.
summary(val_train_Xy$CentralAir)
##   N   Y 
##  39 676
val_train_Xy = val_train_Xy %>% select(-CentralAir)

3.5 Electrical

Back to top.

650 standard circuit breaker and Romex. Probably drop, but there are none mixed, so it might be worth making it an ordered factor if keeping.

# SBrkr dominates Electrical; show the level counts, then drop the column.
summary(val_train_Xy$Electrical)
##  None SBrkr FuseA FuseF FuseP   Mix 
##     0   650    49    14     1     1
val_train_Xy = val_train_Xy %>% select(-Electrical)

3.6 X1stFlrSF

Back to top.

Dropping as component of GrLivArea.

# X1stFlrSF is a component of GrLivArea; summarize, then drop to avoid
# double-counting floor area.
summary(val_train_Xy$X1stFlrSF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334     881    1086    1167    1392    4692
val_train_Xy = val_train_Xy %>% select(-X1stFlrSF)

3.7 X2ndFlrSF

Back to top.

Dropping as component of GrLivArea. The presence of a second floor is encoded in HouseStyle, but it’s also convoluted between a few levels. I’ll make a binary out of it and keep that.

3.7.1 Binarize

# Binarize X2ndFlrSF: 1 if the house has any second-storey area, else 0.
# Keep a numeric 0/1 plus a temporary ordered-factor copy for level tests.
val_train_Xy = val_train_Xy %>%
  mutate('X2ndFlr.bin' = ifelse(X2ndFlrSF <= 0, 0, 1)) %>%
  mutate('X2ndFlr.bin.fact' = factor(X2ndFlr.bin, ordered = T))

x = 'X2ndFlr.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
x = 'X2ndFlr.bin.fact'
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

# Annotated heatmap of the pairwise p-value matrix (no clustering).
heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1" "0"
# The factor copy was only needed for the level tests; keep the numeric.
val_train_Xy = select(val_train_Xy, -c('X2ndFlr.bin.fact'))

3.8 LowQualFinSF

Back to top.

702 0s. Drop this feature, though it would be a useful modifier of GrLivArea as a detracting component of it.

# LowQualFinSF is almost entirely zeros; summarize, then drop the column.
summary(val_train_Xy$LowQualFinSF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   4.846   0.000 528.000
val_train_Xy = val_train_Xy %>% select(-LowQualFinSF)

3.9 GrLivArea

Back to top.

3.9.1 Normalize

As this is polymodal to the number of floors, I’ll start by faceting before transforming.

# GrLivArea is polymodal by storey count, so facet the histogram by the
# X2ndFlr.bin indicator before trying transforms.
ggplot(val_train_Xy, aes(x = GrLivArea)) +
  geom_histogram(binwidth = 50) +
  facet_wrap(facets = vars(X2ndFlr.bin), ncol = 1)

x = 'GrLivArea'
summary(val_train_Xy[x])
##    GrLivArea   
##  Min.   : 334  
##  1st Qu.:1129  
##  Median :1456  
##  Mean   :1515  
##  3rd Qu.:1791  
##  Max.   :5642
# Plot the raw feature next to its best transform (helper from tools/eda.R).
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 50,
  t_binw = .1
)

## NULL
# First transform: log2. Materialize it as its own column.
x = 'log2(GrLivArea)'

val_train_Xy = val_train_Xy %>%
  mutate('log2(GrLivArea)' = log2(GrLivArea))

ggplot(val_train_Xy, aes(x = .data[[x]])) +
  geom_histogram(binwidth = .1) +
  facet_wrap(facets = vars(X2ndFlr.bin), ncol = 1)

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  log2(GrLivArea) 
##  Min.   : 8.384  
##  1st Qu.:10.141  
##  Median :10.508  
##  Mean   :10.486  
##  3rd Qu.:10.807  
##  Max.   :12.462
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = .1,
  t_binw = 1
)

## NULL
# Second transform: square of the log2 (the searcher's pick for log2).
x = 'square(log2(GrLivArea))'

val_train_Xy = val_train_Xy %>%
  mutate('square(log2(GrLivArea))' = log2(GrLivArea)^2)

ggplot(val_train_Xy, aes(x = .data[[x]])) +
  geom_histogram(binwidth = 1) +
  facet_wrap(facets = vars(X2ndFlr.bin), ncol = 1)

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  square(log2(GrLivArea))
##  Min.   : 70.29         
##  1st Qu.:102.84         
##  Median :110.41         
##  Mean   :110.18         
##  3rd Qu.:116.78         
##  Max.   :155.30
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL
# Check the final transform faceted by storey, then by storey x build era.
ggplot(val_train_Xy, aes(x = `square(log2(GrLivArea))`)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(facets = vars(.data$X2ndFlr.bin), ncol = 1)

# NOTE(review): assumes YearBuilt.fact was created upstream (not visible
# here) — confirm it survives to this point.
ggplot(val_train_Xy, aes(x = `square(log2(GrLivArea))`)) +
  geom_histogram(binwidth = 1) +
  facet_grid(
    cols = vars(.data$X2ndFlr.bin),
    rows = vars(.data$YearBuilt.fact)
  )

It looks like there might be more polymodality, particularly among mid-century builds, but I’ll leave it here. The transformations seem to have better normalized both the full variable and its subsets by storey and age. Winsorization should help both too.

3.9.2 Winsorize

x = 'square(log2(GrLivArea))'

qqnorm(y = val_train_Xy$GrLivArea, ylab = 'GrLivArea')
qqline(y = val_train_Xy$GrLivArea, ylab = 'GrLivArea')

qqnorm(y = val_train_Xy$`log2(GrLivArea)`, ylab = 'log2(GrLivArea)')
qqline(y = val_train_Xy$`log2(GrLivArea)`, ylab = 'log2(GrLivArea)')

qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)

Win_log2_x_squared = Winsorize(
  x = val_train_Xy[[x]],
  probs = c(0.002, 0.998),
  na.rm = T
)

qqnorm(y = Win_log2_x_squared, ylab = 'Win_log2_x_squared')
qqline(y = Win_log2_x_squared, ylab = 'Win_log2_x_squared')

Win_log2_x = Winsorize(
  x = val_train_Xy$`log2(GrLivArea)`,
  probs = c(0.002, 0.998),
  na.rm = T
)

qqnorm(y = Win_log2_x, ylab = 'Win_log2_x')
qqline(y = Win_log2_x, ylab = 'Win_log2_x')

Win_raw_x = Winsorize(
  x = val_train_Xy$GrLivArea,
  probs = c(0, 0.95),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')

print(shapiro.test(x = val_train_Xy$GrLivArea))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$GrLivArea
## W = 0.92124, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log2(GrLivArea)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$`log2(GrLivArea)`
## W = 0.99301, p-value = 0.002004
print(shapiro.test(x = val_train_Xy[['square(log2(GrLivArea))']]))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy[["square(log2(GrLivArea))"]]
## W = 0.99284, p-value = 0.001655
print(shapiro.test(x = Win_log2_x_squared))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_log2_x_squared
## W = 0.99603, p-value = 0.06738
print(shapiro.test(x = Win_log2_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_log2_x
## W = 0.99606, p-value = 0.06957
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.97112, p-value = 1.136e-10

Winsorizing the log2 better normalized than squaring it, but Winsorizing the squared log2 is close, too. May be overfit to add the square.

val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(log2(GrLivArea))' = Winsorize(
      ifelse(GrLivArea <= 0, 0, log2(GrLivArea)),
      probs = c(0.002, 0.998),
      na.rm = T
    )
  ) %>%
  mutate(
    'Win(square(log2(GrLivArea)))' = Winsorize(
      ifelse(GrLivArea <= 0, 0, log2(GrLivArea)^2),
      probs = c(0.002, 0.998),
      na.rm = T
    )
  )

3.9.3 Correlations

# Correlate each GrLivArea variant against all numeric features to compare
# multicollinearity, then plot each variant against log(SalePrice).
# Relies on x still naming 'square(log2(GrLivArea))' from the section above.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('GrLivArea', 'log2(GrLivArea)', 'square(log2(GrLivArea))', 'Win(log2(GrLivArea))', 'Win(square(log2(GrLivArea)))')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# df already holds absolute values, so the former summary(abs(df)) was a
# redundant double abs(); summarizing df directly is identical.
summary(df)
##    GrLivArea        log2(GrLivArea)    square(log2(GrLivArea))
##  Min.   :0.004074   Min.   :0.007031   Min.   :0.005632       
##  1st Qu.:0.170723   1st Qu.:0.138405   1st Qu.:0.144044       
##  Median :0.300589   Median :0.308047   Median :0.308343       
##  Mean   :0.320766   Mean   :0.319385   Mean   :0.320768       
##  3rd Qu.:0.444557   3rd Qu.:0.439274   3rd Qu.:0.442560       
##  Max.   :0.829900   Max.   :0.826904   Max.   :0.831350       
##  Win(log2(GrLivArea)) Win(square(log2(GrLivArea)))
##  Min.   :0.007115     Min.   :0.005648            
##  1st Qu.:0.127613     1st Qu.:0.128787            
##  Median :0.306231     Median :0.304675            
##  Mean   :0.317085     Mean   :0.318136            
##  3rd Qu.:0.428208     3rd Qu.:0.429922            
##  Max.   :0.827504     Max.   :0.831948
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

3.9.4 Hard Code

The Winsorized double transformation may be slightly overfitting and overcomputing, but it’s slightly (likely insignificantly) more normal and correlated to SalePrice than the Winsorized single transformation. I’ll go with it despite common sense, just for fun.

x = 'Win(square(log2(GrLivArea)))'

# Hard-code the Winsorization bounds (computed from this training split)
# so the exact same clamp can be applied at scoring time.
min_val = min(Win_log2_x_squared)
max_val = max(Win_log2_x_squared)
print(paste("min_val:", min_val))
## [1] "min_val: 78.8827556705776"
print(paste("max_val:", max_val))
## [1] "max_val: 143.236548514549"
val_train_Xy = val_train_Xy %>%
  mutate(
    # NOTE(review): minval/maxval are older DescTools::Winsorize argument
    # names; newer releases use val = c(min, max) — confirm the installed
    # version accepts these.
    'Win(square(log2(GrLivArea)))' = Winsorize(
      ifelse(GrLivArea <= 0, 0, log2(GrLivArea)^2),
      # probs = c(0.002, 0.998),
      # na.rm = T
      minval = min_val,
      maxval = max_val
    )
  ) %>%
  # Keep only the winner; drop the intermediate transform columns.
  select(-c('log2(GrLivArea)', 'Win(log2(GrLivArea))'))

gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

3.10 BsmtFullBath

Back to top.

To contribute to full count.

3.11 BsmtHalfBath

Back to top.

To contribute to full count.

3.12 FullBath

Back to top.

To contribute to full count.

3.13 HalfBath

Back to top.

To contribute to full count.

3.14 TotBaths

Back to top.

I could make two features, total baths above grade and total basement baths, but I’ll keep it simple.

3.14.1 Normalize

# TotBaths: combined bath count, with half baths weighted 0.5.
val_train_Xy = val_train_Xy %>%
  mutate(TotBaths = FullBath + BsmtFullBath + 0.5*HalfBath + 0.5*BsmtHalfBath)

x = 'TotBaths'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL

# log(SalePrice) by TotBaths, with jitter colored by basement-bath count
# and mean/10th/90th-percentile trend lines overlaid.
ggplot(val_train_Xy, aes(x = factor(TotBaths), y = log(SalePrice))
) +
  geom_jitter(alpha = 0.5, aes(color = factor(BsmtFullBath + BsmtHalfBath))) +
  geom_boxplot(
    notch = T,
    notchwidth = .1,
    varwidth = T,
    alpha = 0,
    color = 'blue'
  ) +
  geom_violin(alpha = 0) +
  geom_line(
    stat = 'summary',
    fun = quantile,
    fun.args = list(probs = .9),
    linetype = 2, aes(group = 1)
  ) +
  geom_line(stat = 'summary', fun = mean, mapping = aes(group = 1)) +
  geom_line(
    stat = 'summary',
    fun = quantile,
    fun.args = list(probs = .1),
    linetype = 2, aes(group = 1)
  ) +
  xlab(label = 'TotBaths') + ylab(label = 'log(SalePrice)')

Basement bathrooms don’t seem to add much value when there aren’t many bathrooms altogether.

# Pairwise level tests on TotBaths vs log(SalePrice).
# NOTE(review): min_n = 29 here vs 30 elsewhere — confirm intent.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "3.5" "2"   "3"   "1"   "1.5" "2.5"
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##     TotBaths    
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :2.000  
##  Mean   :2.217  
##  3rd Qu.:2.500  
##  Max.   :6.000
# Compare TotBaths to its best transform (sqrt per the searcher).
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 0.5,
  t_binw = 0.1
)

## NULL
# Materialize the sqrt transform as its own column.
x = 'sqrt(TotBaths)'
val_train_Xy = val_train_Xy %>%
  mutate('sqrt(TotBaths)' = sqrt(TotBaths))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  sqrt(TotBaths) 
##  Min.   :1.000  
##  1st Qu.:1.414  
##  Median :1.414  
##  Mean   :1.464  
##  3rd Qu.:1.581  
##  Max.   :2.449
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = .1,
  t_binw = .1
)

## NULL

3.14.2 Winsorize

x = 'sqrt(TotBaths)'

# QQ plots for the raw and sqrt-transformed bath counts.
qqnorm(y = val_train_Xy$TotBaths, ylab = 'TotBaths')
qqline(y = val_train_Xy$TotBaths, ylab = 'TotBaths')

qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)

# Top-code the sqrt version at the 99.9th percentile.
Win_sqrt_x = Winsorize(
  x = val_train_Xy[[x]],
  probs = c(0.0, 0.999),
  na.rm = T
)

qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')

# Top-code the raw version at the 99.8th percentile.
Win_raw_x = Winsorize(
  x = val_train_Xy$TotBaths,
  probs = c(0, 0.998),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')

# Normality checks for all four candidates.
print(shapiro.test(x = val_train_Xy$TotBaths))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$TotBaths
## W = 0.92874, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`sqrt(TotBaths)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$`sqrt(TotBaths)`
## W = 0.9241, p-value < 2.2e-16
print(shapiro.test(x = Win_sqrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_sqrt_x
## W = 0.92391, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.93136, p-value < 2.2e-16

3.14.3 Correlations

# Correlate each TotBaths variant against all numeric features, then
# plot each variant against log(SalePrice).
x = 'Win(TotBaths)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotBaths', 'sqrt(TotBaths)', 'Win(sqrt(TotBaths))', 'Win(TotBaths)')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# df already holds absolute values, so the former summary(abs(df)) was a
# redundant double abs(); summarizing df directly is identical.
summary(df)
##     TotBaths        sqrt(TotBaths)     Win(sqrt(TotBaths)) Win(TotBaths)     
##  Min.   :0.004047   Min.   :0.001146   Min.   :0.0004721   Min.   :0.002175  
##  1st Qu.:0.157148   1st Qu.:0.155984   1st Qu.:0.1568243   1st Qu.:0.160011  
##  Median :0.329253   Median :0.331356   Median :0.3319365   Median :0.331592  
##  Mean   :0.329428   Mean   :0.328623   Mean   :0.3294207   Mean   :0.332448  
##  3rd Qu.:0.487119   3rd Qu.:0.483214   3rd Qu.:0.4844607   3rd Qu.:0.493532  
##  Max.   :0.687363   Max.   :0.687107   Max.   :0.6868680   Max.   :0.688339
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

3.14.4 Hard Code

x = 'Win(TotBaths)'

# Hard-code the training-split Winsorization bounds for reuse at scoring time.
min_val = min(Win_raw_x)
max_val = max(Win_raw_x)
print(paste("min_val:", min_val))
## [1] "min_val: 1"
print(paste("max_val:", max_val))
## [1] "max_val: 4.786"
val_train_Xy = val_train_Xy %>%
  mutate(
    # NOTE(review): minval/maxval are older DescTools::Winsorize argument
    # names; newer releases use val = c(min, max) — confirm version.
    'Win(TotBaths)' = Winsorize(
      FullBath + BsmtFullBath + 0.5*HalfBath + 0.5*BsmtHalfBath,
      # probs = c(0.002, 0.998),
      # na.rm = T
      minval = min_val,
      maxval = max_val
    )
  ) %>%
  # Drop the losing transform columns.
  select(-c('sqrt(TotBaths)', 'Win(sqrt(TotBaths))'))

gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 0.5)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

3.15 BedroomAbvGr

Back to top.

3.15.1 Normalize

# BedroomAbvGr: summarize vs SalePrice, test levels via a temporary
# factor copy, then search for a normalizing transform.
x = 'BedroomAbvGr'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'

# Work on a scratch frame so the factor copy never touches val_train_Xy.
df = select(val_train_Xy, c('log(SalePrice)', 'BedroomAbvGr'))
df$BedroomAbvGr.fact = factor(df$BedroomAbvGr)
sum_and_trans_fact(data = df, x = 'BedroomAbvGr.fact', y = y)

## NULL
p_vals = get_signif_levels(
  data = df,
  x = 'BedroomAbvGr.fact',
  z = y,
  min_n = 30
)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "3" "2" "4"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BedroomAbvGr')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##   BedroomAbvGr     
##  Min.   :0.004635  
##  1st Qu.:0.041301  
##  Median :0.090023  
##  Mean   :0.151626  
##  3rd Qu.:0.209621  
##  Max.   :0.654507
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 0.5,
  t_binw = 0.1
)

## NULL
# Materialize the sqrt transform and refit the normalizer table.
x = 'sqrt(BedroomAbvGr)'
val_train_Xy = val_train_Xy %>%
  mutate('sqrt(BedroomAbvGr)' = sqrt(BedroomAbvGr))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  sqrt(BedroomAbvGr)
##  Min.   :1.000     
##  1st Qu.:1.414     
##  Median :1.732     
##  Mean   :1.682     
##  3rd Qu.:1.732     
##  Max.   :2.828
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = .1,
  t_binw = .1
)

## NULL

## NULL

3.15.2 Winsorize

x = 'sqrt(BedroomAbvGr)'

# QQ plots for raw and sqrt-transformed bedroom counts.
qqnorm(y = val_train_Xy$BedroomAbvGr, ylab = 'BedroomAbvGr')
qqline(y = val_train_Xy$BedroomAbvGr, ylab = 'BedroomAbvGr')

qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)

# Top-code the sqrt version at the 99.9th percentile.
Win_sqrt_x = Winsorize(
  x = val_train_Xy[[x]],
  probs = c(0, 0.999),
  na.rm = T
)

qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')

# Top-code the raw version at the 99.5th percentile.
Win_raw_x = Winsorize(
  x = val_train_Xy$BedroomAbvGr,
  probs = c(0, 0.995),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')

# Normality checks — none of the candidates is close to normal here.
print(shapiro.test(x = val_train_Xy$BedroomAbvGr))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$BedroomAbvGr
## W = 0.8457, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`sqrt(BedroomAbvGr)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$`sqrt(BedroomAbvGr)`
## W = 0.84731, p-value < 2.2e-16
print(shapiro.test(x = Win_sqrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_sqrt_x
## W = 0.8486, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.8574, p-value < 2.2e-16

3.15.3 Correlations

# Correlate each BedroomAbvGr variant against all numeric features, then
# plot each variant against log(SalePrice).
x = 'Win(BedroomAbvGr)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BedroomAbvGr', 'sqrt(BedroomAbvGr)', 'Win(sqrt(BedroomAbvGr))', 'Win(BedroomAbvGr)')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# df already holds absolute values, so the former summary(abs(df)) was a
# redundant double abs(); summarizing df directly is identical.
summary(df)
##   BedroomAbvGr      sqrt(BedroomAbvGr) Win(sqrt(BedroomAbvGr))
##  Min.   :0.004635   Min.   :0.001827   Min.   :0.002047       
##  1st Qu.:0.041301   1st Qu.:0.044497   1st Qu.:0.044839       
##  Median :0.090023   Median :0.094369   Median :0.094250       
##  Mean   :0.151626   Mean   :0.150984   Mean   :0.151281       
##  3rd Qu.:0.209621   3rd Qu.:0.216792   3rd Qu.:0.217943       
##  Max.   :0.654507   Max.   :0.638464   Max.   :0.635445       
##  Win(BedroomAbvGr) 
##  Min.   :0.003767  
##  1st Qu.:0.041751  
##  Median :0.091144  
##  Mean   :0.154296  
##  3rd Qu.:0.219703  
##  Max.   :0.649031
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

3.15.4 Hard Code

x = 'Win(BedroomAbvGr)'

# Hard-code the training-split Winsorization bounds for reuse at scoring time.
min_val = min(Win_raw_x)
max_val = max(Win_raw_x)
print(paste("min_val:", min_val))
## [1] "min_val: 1"
print(paste("max_val:", max_val))
## [1] "max_val: 5.42999999999995"
val_train_Xy = val_train_Xy %>%
  mutate(
    # NOTE(review): minval/maxval are older DescTools::Winsorize argument
    # names; newer releases use val = c(min, max) — confirm version.
    'Win(BedroomAbvGr)' = Winsorize(
      BedroomAbvGr,
      # probs = c(0.002, 0.998),
      # na.rm = T
      minval = min_val,
      maxval = max_val
    )
  ) %>%
  # Drop the losing transform columns.
  select(-c('sqrt(BedroomAbvGr)', 'Win(sqrt(BedroomAbvGr))'))

gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

3.16 KitchenAbvGr

Back to top.

# KitchenAbvGr is almost always 1; summarize, then drop the column.
summary(val_train_Xy$KitchenAbvGr)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   1.000   1.041   1.000   3.000
val_train_Xy = val_train_Xy %>% select(-KitchenAbvGr)

3.17 KitchenQual

Back to top.

# Summarize SalePrice (raw, then log) across KitchenQual levels and run
# the pairwise level tests.
x = 'KitchenQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
# NOTE(review): min_n = 29 here vs 30 elsewhere — confirm intent.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Gd" "TA" "Ex"

3.18 TotRmsAbvGrd

Back to top.

# TotRmsAbvGrd: level tests via a scratch factor copy, transform search,
# and a correlation scan against the other numeric features.
x = 'TotRmsAbvGrd'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'

# Scratch frame keeps the factor copy out of val_train_Xy.
df = select(val_train_Xy, c('log(SalePrice)', 'TotRmsAbvGrd'))
df$TotRmsAbvGrd.fact = factor(df$TotRmsAbvGrd)
sum_and_trans_fact(data = df, x = 'TotRmsAbvGrd.fact', y = y)

## NULL
p_vals = get_signif_levels(
  data = df,
  x = 'TotRmsAbvGrd.fact',
  z = y,
  min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "8" "5" "6" "4" "7" "9"
summary(val_train_Xy[x])
##   TotRmsAbvGrd   
##  Min.   : 2.000  
##  1st Qu.: 5.000  
##  Median : 6.000  
##  Mean   : 6.543  
##  3rd Qu.: 7.000  
##  Max.   :14.000
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotRmsAbvGrd')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##   TotRmsAbvGrd    
##  Min.   :0.00436  
##  1st Qu.:0.10320  
##  Median :0.22718  
##  Mean   :0.28644  
##  3rd Qu.:0.40813  
##  Max.   :0.83195

3.18.1 Winsorize

x = 'TotRmsAbvGrd'

# QQ plot of the raw counts, then of a 97.5th-percentile top-code.
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)

Win_raw_x = Winsorize(
  x = val_train_Xy$TotRmsAbvGrd,
  probs = c(0, 0.975),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win(TotRmsAbvGrd)')
qqline(y = Win_raw_x, ylab = 'Win(TotRmsAbvGrd)')

# Normality before vs after the top-code.
print(shapiro.test(x = val_train_Xy$TotRmsAbvGrd))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$TotRmsAbvGrd
## W = 0.93616, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.94556, p-value = 1.508e-15
# Persist the Winsorized column.
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(TotRmsAbvGrd)' = Winsorize(
      TotRmsAbvGrd,
      probs = c(0, 0.975),
      na.rm = T
    )
  )

3.18.2 Correlations

# Compare raw vs Winsorized TotRmsAbvGrd against the other numeric
# features, then against log(SalePrice).
x = 'Win(TotRmsAbvGrd)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotRmsAbvGrd', 'Win(TotRmsAbvGrd)')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# df already holds absolute values, so the former summary(abs(df)) was a
# redundant double abs(); summarizing df directly is identical.
summary(df)
##   TotRmsAbvGrd     Win(TotRmsAbvGrd) 
##  Min.   :0.00436   Min.   :0.006598  
##  1st Qu.:0.10320   1st Qu.:0.092855  
##  Median :0.22718   Median :0.224392  
##  Mean   :0.28644   Mean   :0.286050  
##  3rd Qu.:0.40813   3rd Qu.:0.414866  
##  Max.   :0.83195   Max.   :0.833412
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

3.18.3 Hard Code

x = 'Win(TotRmsAbvGrd)'

# Hard-code the training-split Winsorization bounds for reuse at scoring time.
min_val = min(Win_raw_x)
max_val = max(Win_raw_x)
print(paste("min_val:", min_val))
## [1] "min_val: 2"
print(paste("max_val:", max_val))
## [1] "max_val: 10.15"
val_train_Xy = val_train_Xy %>%
  mutate(
    # NOTE(review): minval/maxval are older DescTools::Winsorize argument
    # names; newer releases use val = c(min, max) — confirm version.
    'Win(TotRmsAbvGrd)' = Winsorize(
      TotRmsAbvGrd,
      # probs = c(0, 0.975),
      # na.rm = T
      minval = min_val,
      maxval = max_val
    )
  )

gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

3.19 Functional

Back to top.

# Summarize SalePrice (raw, then log) across Functional levels; the level
# tests find no significant pairs (character(0) below).
x = 'Functional'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
# NOTE(review): min_n = 29 here vs 30 elsewhere — confirm intent.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)

3.20 Fireplaces

Back to top.

# Fireplaces: treat as ordered factor for level tests, then scan
# correlations against the other numeric features.
x = 'Fireplaces'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
# NOTE: this factor copy is added to val_train_Xy itself (unlike the
# scratch-frame pattern used for BedroomAbvGr/TotRmsAbvGrd).
val_train_Xy$Fireplaces.fact = factor(val_train_Xy$Fireplaces, ordered = T)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = 'Fireplaces.fact', y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = 'Fireplaces.fact', z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "2" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('Fireplaces')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##    Fireplaces       
##  Min.   :0.0001063  
##  1st Qu.:0.1125161  
##  Median :0.2436102  
##  Mean   :0.2242305  
##  3rd Qu.:0.3210328  
##  Max.   :0.4867665
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

plot_scat_pairs(df = val_train_Xy, x = x, y_lst = c(y))

## NULL

The variable needs more than three unique values in order to use my function to check for transformations, and 0 can’t be one of them in order to include log transformations. So, I’ll use Fireplaces + 1 to search for transformations.

# Shift Fireplaces by +1 so zero-excluding transforms (log, 1/x) become valid,
# then re-run the transformation search and plot the best one.
val_train_Xy$Fireplaces_plus1 = val_train_Xy$Fireplaces + 1
x = 'Fireplaces_plus1'

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL

3.20.1 Winsorize

# Winsorize Fireplaces at the 99.9th percentile and compare normality
# (QQ plots + Shapiro-Wilk) before and after.
x = 'Fireplaces'

qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)

Win_raw_x = Winsorize(
  x = val_train_Xy$Fireplaces,
  probs = c(0, 0.999),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')

# Identical W statistics below show Winsorization changed nothing here.
print(shapiro.test(x = val_train_Xy$Fireplaces))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$Fireplaces
## W = 0.76773, p-value < 2.2e-16
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.76773, p-value < 2.2e-16

Winsorization doesn’t help.

3.20.2 Binarize

There’s no apparent meaningful difference in price between houses with 1 fireplace and those with 2 or 3. It’s significant (p < 0.1), but not apparently meaningful given the overlapping notches in the boxplot above, though I did not take Cohen’s d for effect size.

Binarization may condense the feature space without a lot of information loss. In fact, it improves the correlation to the target variable.

# Binarize Fireplaces (any vs none), test the two levels, compare correlations
# of raw vs binary versions, then drop the intermediate columns.
val_train_Xy = val_train_Xy %>%
  mutate(Fireplaces.bin = ifelse(Fireplaces == 0, 0, 1))

x = 'Fireplaces.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
val_train_Xy$Fireplaces.bin.fact = factor(val_train_Xy$Fireplaces.bin, ordered = T)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = 'Fireplaces.bin.fact', y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = 'Fireplaces.bin.fact', z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "1"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('Fireplaces', 'Fireplaces.bin')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# FIX: df already holds absolute values; the previous summary(abs(df))
# applied abs() twice. Output is identical.
summary(df)
##    Fireplaces        Fireplaces.bin    
##  Min.   :0.0001063   Min.   :0.002832  
##  1st Qu.:0.1163174   1st Qu.:0.138881  
##  Median :0.2436660   Median :0.220999  
##  Mean   :0.2380836   Mean   :0.237974  
##  3rd Qu.:0.3215369   3rd Qu.:0.299826  
##  Max.   :1.0000000   Max.   :0.886894
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

# Keep only Fireplaces.bin; drop the raw feature and the helper columns.
val_train_Xy = select(
  val_train_Xy,
  -c('Fireplaces.fact', 'Fireplaces_plus1', 'Fireplaces.bin.fact',
     'Fireplaces')
)

3.21 FireplaceQu

Back to top.

# FireplaceQu: summarize price by quality level and test level-wise
# differences in log(SalePrice).
x = 'FireplaceQu'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
# NOTE(review): min_n = 29 here vs 30 elsewhere -- presumably to admit a level
# with exactly 29 observations; confirm intent.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "None" "TA"   "Gd"

One-hot encoding FireplaceQu will include a binarized version (“None”:int[0,1]), so we can drop Fireplaces.bin altogether, and Fireplaces because it just adds noise. I’ll take care of that during modeling with caret.

3.22 GarageType

Back to top.

# GarageType: summarize price by type and test level-wise differences in
# log(SalePrice).
x = 'GarageType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Attchd"  "Detchd"  "None"    "BuiltIn"

3.23 GarageYrBlt

Back to top.

Similar distribution as year built for house. May not add additional information, but may be useful in interaction with type and finish. Could drop and use YearBuilt as proxy but would lose some info.

# GarageYrBlt: distribution overview. 40 NAs correspond to houses without
# garages (per the 'None' counts in GarageQual/GarageCond below).
x = 'GarageYrBlt'
summary(val_train_Xy[x])
##   GarageYrBlt  
##  Min.   :1908  
##  1st Qu.:1961  
##  Median :1979  
##  Mean   :1978  
##  3rd Qu.:2002  
##  Max.   :2010  
##  NA's   :40
# sum_and_trans_cont(
#   data = val_train_Xy,
#   x = x,
#   func = best_normalizers[[x]]$best_func$func,
#   func_name = best_normalizers[[x]]$best_func$name,
#   x_binw = 1,
#   t_binw = 1
# )

# Histogram + notched boxplot of the raw feature.
gg = ggplot(val_train_Xy, aes(x = GarageYrBlt))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

3.23.1 Correlations

# Correlations of GarageYrBlt to the other numeric features (NA rows excluded),
# then a boxplot of the |r| values and a scatter vs the target.
x = 'GarageYrBlt'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# FIX: df already holds absolute values; the previous summary(abs(df))
# applied abs() twice. Output is identical.
summary(df)
##   GarageYrBlt      
##  Min.   :0.006671  
##  1st Qu.:0.057040  
##  Median :0.177159  
##  Mean   :0.235013  
##  3rd Qu.:0.313315  
##  Max.   :0.847412
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)

## NULL

3.23.2 “Controlling”

Even “controlling” for the year the house was built, GarageYrBlt doesn’t predict sale prices well, as seen in the next few plots. I’ll just drop it.

# "Control" for YearBuilt: flag garages built after the house, inspect the
# joint distribution, regress the target on sqrt(Age), and check whether
# GarageYrBlt explains the residuals. It doesn't, so both helpers are dropped.
val_train_Xy = val_train_Xy %>%
  mutate(
    'GarBltLater' = factor(ifelse((GarageYrBlt - YearBuilt) <= 0, 0, 1))
  )

ggplot(val_train_Xy, aes(x = GarageYrBlt, y = YearBuilt)) +
  geom_jitter(
    alpha = 0.5,
    aes(
      color = SalePrice.fact
    ),
    position = position_jitter(w = 1, h = 1)
  )

fbv = fenced_jbv(
  data = val_train_Xy,
  x = 'GarBltLater',
  y = 'log(SalePrice)'
)
fbv

fbv +
  facet_wrap(facets = vars(YearBuilt.fact))

# NOTE(review): assigning lm()$residuals back into the data frame assumes no
# NAs in `log(SalePrice)` or `sqrt(Age)` (otherwise lengths mismatch) -- confirm.
val_train_Xy$resids = lm(
  `log(SalePrice)` ~ `sqrt(Age)`,
  val_train_Xy
)$residuals

# Only houses whose garage year differs from the build year are informative.
df = filter(
  val_train_Xy,
  GarageYrBlt - YearBuilt != 0
)

ggplot(df, aes(x = GarageYrBlt, y = resids)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  ylab(label = 'log(SalePrice) ~ YearBuilt Residuals')

# NOTE(review): "GarageYRBlt" in the message is a typo for GarageYrBlt;
# left as-is to match the rendered output below.
print(
  paste(
    "Correlation of GarageYRBlt to Age residual:",
    cor(df$GarageYrBlt, df$resids)
  )
)
## [1] "Correlation of GarageYRBlt to Age residual: 0.0701692753019878"
val_train_Xy = select(val_train_Xy, -c('GarageYrBlt', 'GarBltLater'))

3.24 GarageFinish

Back to top.

# GarageFinish: summarize price by level and test level-wise differences in
# log(SalePrice).
x = 'GarageFinish'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "RFn"  "Unf"  "None" "Fin"

3.25 GarageCars

Back to top.

# GarageCars: summarize by count, test level-wise significance on an ordered
# factor, check correlations, then (via the +1 shift) search transformations.
x = 'GarageCars'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
# Ordered factor version for the level-wise significance tests below.
val_train_Xy$GarageCars.fact = factor(
  val_train_Xy$GarageCars,
  ordered = T
)
sum_and_trans_fact(data = val_train_Xy, x = 'GarageCars.fact', y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = 'GarageCars.fact', z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "2" "1" "3" "0"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# FIX: df already holds absolute values; the previous summary(abs(df))
# applied abs() twice. Output is identical.
summary(df)
##    GarageCars      
##  Min.   :0.005684  
##  1st Qu.:0.162049  
##  Median :0.279956  
##  Mean   :0.293729  
##  3rd Qu.:0.408933  
##  Max.   :0.870258
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

plot_scat_pairs(df = val_train_Xy, x = x, y_lst = c(y))

## NULL
# Shift by +1 so zero-excluding transforms (log, 1/x) become valid.
val_train_Xy$GarageCars.plus1 = val_train_Xy$GarageCars + 1
x = 'GarageCars.plus1'

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL

3.25.1 Winsorize

# Winsorize GarageCars at the 99.9th percentile and compare normality
# (QQ plots + Shapiro-Wilk) before and after.
x = 'GarageCars'

qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)

val_train_Xy$`Win(GarageCars)` = Winsorize(
  x = val_train_Xy[[x]],
  probs = c(0, 0.999),
  na.rm = T
)

qqnorm(y = val_train_Xy$`Win(GarageCars)`, ylab = 'Win_raw_x')
qqline(y = val_train_Xy$`Win(GarageCars)`, ylab = 'Win_raw_x')

# W drops slightly after Winsorizing, i.e. normality gets marginally worse.
print(shapiro.test(x = val_train_Xy[[x]]))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy[[x]]
## W = 0.83694, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`Win(GarageCars)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$`Win(GarageCars)`
## W = 0.83446, p-value < 2.2e-16

3.25.2 Binarize

# Binarize GarageCars (any vs none), test the two levels, and compare
# correlations of the raw, Winsorized, and binary versions.
val_train_Xy = val_train_Xy %>%
  mutate(GarageCars.bin = ifelse(GarageCars == 0, 0, 1))

x = 'GarageCars.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
val_train_Xy$GarageCars.bin.fact =
  factor(val_train_Xy$GarageCars.bin, ordered = T)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = 'GarageCars.bin.fact', y = y)

## NULL
p_vals = get_signif_levels(
  data = val_train_Xy, x = 'GarageCars.bin.fact', z = y, min_n = 30
)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1" "0"
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('GarageCars', 'Win(GarageCars)', 'GarageCars.bin')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# FIX: df already holds absolute values; the previous summary(abs(df))
# applied abs() twice. Output is identical.
summary(df)
##    GarageCars       Win(GarageCars)    GarageCars.bin    
##  Min.   :0.005684   Min.   :0.005378   Min.   :0.005967  
##  1st Qu.:0.163144   1st Qu.:0.163484   1st Qu.:0.051718  
##  Median :0.282048   Median :0.283037   Median :0.090024  
##  Mean   :0.306341   Mean   :0.307958   Mean   :0.118027  
##  3rd Qu.:0.419101   3rd Qu.:0.421339   3rd Qu.:0.148698  
##  Max.   :1.000000   Max.   :0.999361   Max.   :0.568588
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

That didn’t seem to help.

3.26 GarageArea

Back to top.

3.26.1 Normalize

Polymodal in interaction with GarageCars.

# GarageArea: summarize, apply the best normalizer found so far, then commit
# the sqrt transform and re-run the normalizer search on the new column.
x = 'GarageArea'
summary(val_train_Xy[x])
##    GarageArea    
##  Min.   :   0.0  
##  1st Qu.: 312.0  
##  Median : 480.0  
##  Mean   : 469.3  
##  3rd Qu.: 576.0  
##  Max.   :1418.0
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 50,
  t_binw = 1
)

## NULL
x = 'sqrt(GarageArea)'
val_train_Xy = val_train_Xy %>%
  mutate('sqrt(GarageArea)' = sqrt(GarageArea))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  sqrt(GarageArea)
##  Min.   : 0.00   
##  1st Qu.:17.66   
##  Median :21.91   
##  Mean   :20.68   
##  3rd Qu.:24.00   
##  Max.   :37.66
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL

3.26.2 Winsorize

# Winsorize GarageArea (raw and sqrt-transformed) on the no-garage-excluded
# subset, compare normality, then hard-code the sqrt-version's bounds into
# a new column (zeros, i.e. no garage, pass through unchanged).
x = 'sqrt(GarageArea)'
df = filter(val_train_Xy, GarageArea != 0)

qqnorm(y = df$GarageArea, ylab = 'GarageArea')
qqline(y = df$GarageArea, ylab = 'GarageArea')

qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)

Win_sqrt_x = Winsorize(
  x = df[[x]],
  probs = c(0, 0.998),
  na.rm = T
)

qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')

Win_raw_x = Winsorize(
  x = df$GarageArea,
  probs = c(0, 0.987),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')

# Winsorized sqrt version has the highest W (most normal) of the four.
print(shapiro.test(x = val_train_Xy$GarageArea))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$GarageArea
## W = 0.97888, p-value = 1.218e-08
print(shapiro.test(x = val_train_Xy$`sqrt(GarageArea)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$`sqrt(GarageArea)`
## W = 0.85021, p-value < 2.2e-16
print(shapiro.test(x = Win_sqrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_sqrt_x
## W = 0.98761, p-value = 1.749e-05
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.97029, p-value = 1.824e-10
# Fixed bounds so the transform is reproducible on test data.
min_val = min(Win_sqrt_x)
max_val = max(Win_sqrt_x)
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(sqrt(GarageArea))' = ifelse(
      GarageArea == 0,
      0,
      Winsorize(
        sqrt(GarageArea),
        minval = min_val,
        maxval = max_val
      )
    )
  )

3.26.3 Correlations

# Compare correlations of raw, sqrt, and Winsorized-sqrt GarageArea to the
# other numerics, both with and without the zero (no garage) rows.
x = 'Win(sqrt(GarageArea))'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('GarageArea', 'sqrt(GarageArea)', x)

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# FIX: df already holds absolute values; the previous summary(abs(df))
# applied abs() twice. Output is identical.
summary(df)
##    GarageArea       sqrt(GarageArea)   Win(sqrt(GarageArea))
##  Min.   :0.006474   Min.   :0.001747   Min.   :0.001705     
##  1st Qu.:0.139205   1st Qu.:0.121113   1st Qu.:0.121213     
##  Median :0.324645   Median :0.268493   Median :0.264631     
##  Mean   :0.317963   Mean   :0.287737   Mean   :0.286712     
##  3rd Qu.:0.430672   3rd Qu.:0.375451   3rd Qu.:0.372716     
##  Max.   :0.873194   Max.   :0.875112   Max.   :0.876250
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

# Same comparison, excluding houses without garages.
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
# FIX: same redundant double abs() removed here as well.
summary(df)
##    GarageArea       sqrt(GarageArea)   Win(sqrt(GarageArea))
##  Min.   :0.002476   Min.   :0.002494   Min.   :0.002863     
##  1st Qu.:0.128551   1st Qu.:0.144903   1st Qu.:0.145087     
##  Median :0.337145   Median :0.322871   Median :0.319261     
##  Mean   :0.315598   Mean   :0.315749   Mean   :0.314570     
##  3rd Qu.:0.456501   3rd Qu.:0.470921   3rd Qu.:0.470155     
##  Max.   :0.819002   Max.   :0.837390   Max.   :0.840302     
##  NA's   :1          NA's   :1          NA's   :1
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

Transforming and Winsorizing doesn’t help the overall correlation to SalePrice. Let’s facet it out and see.

3.26.4 Garage Area and Cars by Type

# Facet GarageArea-vs-price by garage type and car count, then compare
# correlations overall and grouped by type and by car count.
# NOTE(review): `Win(log(SalePrice))` is created in an off-screen chunk --
# confirm it exists before this point.
df = filter(
  val_train_Xy,
  (GarageType %in% c('Attchd', 'Detchd', 'BuiltIn')) &
    (GarageCars != 4) # Only one point in this subset with 4 and it doesn't really change the correlations, but it saves viz space.
)

ggplot(df, aes(x = `Win(sqrt(GarageArea))`, y = `log(SalePrice)`)) +
  geom_jitter(alpha = 0.5) +
  geom_smooth(method = 'lm') +
  facet_grid(rows = vars(GarageType), cols = vars(GarageCars.fact))

# Ungrouped baseline correlations for the three area variants.
sum_df0 = df %>%
  summarize(
    n = n(),
    GarageArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`GarageArea`
    ),
    sqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`sqrt(GarageArea)`
    ),
    WinsqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`Win(sqrt(GarageArea))`
    )
  )

min_val = min_n = 30

# sum_df1 = df %>%
#   group_by(GarageCars.fact, GarageType) %>%
#   summarize(
#     n = n(),
#     GarageArea_cor = cor(
#       x = .data$`Win(log(SalePrice))`,
#       y = .data$`GarageArea`
#     ),
#     sqrtArea_cor = cor(
#       x = .data$`Win(log(SalePrice))`,
#       y = .data$`sqrt(GarageArea)`
#     ),
#     WinsqrtArea_cor = cor(
#       x = .data$`Win(log(SalePrice))`,
#       y = .data$`Win(sqrt(GarageArea))`
#     )
#   ) %>%
#   filter(n >= min_n)

# Correlations grouped by garage type (groups under min_n dropped).
sum_df2 = df %>% group_by(GarageType) %>%
  summarize(
    n = n(),
    GarageArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`GarageArea`
    ),
    sqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`sqrt(GarageArea)`
    ),
    WinsqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`Win(sqrt(GarageArea))`
    )
  ) %>%
  filter(n >= min_n)

# Correlations grouped by number of cars (groups under min_n dropped).
sum_df3 = df %>% group_by(GarageCars.fact) %>%
  summarize(
    n = n(),
    GarageArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`GarageArea`
    ),
    sqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`sqrt(GarageArea)`
    ),
    WinsqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`Win(sqrt(GarageArea))`
    )
  ) %>%
  filter(n >= min_n)

# Combine the three views into one comparison table.
sum_df0 %>%
  merge(y = sum_df2, all = T) %>%
  merge(y = sum_df3, all = T) %>%
  # merge(y = sum_df1, all = T) %>%
  select(
    c('GarageType', 'GarageCars.fact', 'n', 'GarageArea_cor',
      'sqrtArea_cor', 'WinsqrtArea_cor')
  ) %>%
  arrange(GarageType, GarageCars.fact)

Transforming and Winsorizing GarageArea improves correlation to the target variable (“Win(log(SalePrice))”) somewhat when excluding houses without garages. Grouping by garage type helps in some cases.

Whether to transform the variable or not may depend on which ML algorithm we’re using and how. A decision tree will likely be indifferent to transformations, though it may benefit from noise reduction with Winsorization. A linear regression will only benefit if type and/or missingness is factored in, as would KNN.

Grouping by number of cars lowers the area:price correlation to no correlation. And, in fact, number of cars has a stronger correlation to the target variable than area does. Let’s see how grouping by type further improves that correlation.

# Compare GarageCars (raw vs Winsorized) and Winsorized-sqrt area correlations
# to the target, overall and grouped by garage type.
df = filter(
  val_train_Xy,
  (GarageType %in% c('Attchd', 'Detchd', 'BuiltIn'))
)

ggplot(df, aes(x = GarageCars, y = `log(SalePrice)`)) +
  geom_jitter(alpha = 0.5) +
  geom_smooth(method = 'lm') +
  facet_wrap(facets = vars(GarageType))

# Ungrouped baseline.
sum_df0 = df %>%
  summarize(
    n = n(),
    Cars_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$GarageCars
    ),
    WinCars_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`Win(GarageCars)`
    ),
    WinsqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`Win(sqrt(GarageArea))`
    )
  )

# Grouped by garage type.
sum_df2 = df %>% group_by(GarageType) %>%
  summarize(
    n = n(),
    Cars_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$GarageCars
    ),
    WinCars_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`Win(GarageCars)`
    ),
    WinsqrtArea_cor = cor(
      x = .data$`Win(log(SalePrice))`,
      y = .data$`Win(sqrt(GarageArea))`
    )
  )

sum_df0 %>%
  merge(y = sum_df2, all = T) %>%
  select(
    c('GarageType', 'n', 'Cars_cor', 'WinCars_cor', 'WinsqrtArea_cor')
  ) %>%
  arrange(GarageType)

It looks worth dropping GarageArea in favor of GarageCars. There is an argument against linear regression using discrete variables, but it seems to work nonetheless.

Winsorizing GarageCars only produces a marginal benefit which may prove spurious as it only adjusts one point. Plus, it reduces normality. So, I’ll just use the raw feature.

There are likely other features at play, like year built, that also affect things like the difference in the prices of two-car attached garages and two-car detached garages.

# Final faceted look (excluding the single 4-car point), then drop all garage
# helper columns and the area variants in favor of raw GarageCars.
fenced_jbv(
  data = df[df$GarageCars != 4, ],
  x = 'GarageType',
  y = 'log(SalePrice)',
  jit_col = 'YearBuilt.fact',
  leg_lbl = 'Year Built'
) +
  facet_wrap(facets = vars(GarageCars.fact))

val_train_Xy = select(
  val_train_Xy,
  -c('GarageCars.fact', 'GarageCars.plus1', 'GarageCars.bin',
     'GarageCars.bin.fact', 'GarageArea', 'sqrt(GarageArea)',
     'Win(sqrt(GarageArea))')
)

3.27 GarageQual, Cond

Back to top.

# GarageQual and GarageCond are each dominated by a single level (TA: 633 and
# 646 of 715), so they carry little signal; inspect the counts, then drop both.
summary(val_train_Xy[ , c('GarageQual', 'GarageCond')])
##  GarageQual GarageCond
##  None: 40   None: 40  
##  Po  :  1   Po  :  4  
##  Fa  : 29   Fa  : 20  
##  TA  :633   TA  :646  
##  Gd  : 11   Gd  :  4  
##  Ex  :  1   Ex  :  1
val_train_Xy = val_train_Xy %>%
  select(-c('GarageQual', 'GarageCond'))

3.28 PavedDrive

Back to top.

# PavedDrive is overwhelmingly 'Y' (643 of 715), adding little signal;
# inspect the level counts, then drop the column.
summary(val_train_Xy$PavedDrive)
## None    N    P    Y 
##    0   53   19  643
val_train_Xy = val_train_Xy %>% select(-c('PavedDrive'))

3.29 WoodDeckSF

Back to top.

3.29.1 Normalize

# WoodDeckSF: summarize (note the zero-heavy distribution -- median is 0),
# apply the best normalizer on the nonzero subset, commit the cube-root
# transform, and re-run the normalizer search.
x = 'WoodDeckSF'
summary(val_train_Xy[x])
##    WoodDeckSF    
##  Min.   :  0.00  
##  1st Qu.:  0.00  
##  Median :  0.00  
##  Mean   : 96.78  
##  3rd Qu.:175.50  
##  Max.   :736.00
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$WoodDeckSF != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 10,
  t_binw = .1
)

## NULL
x = 'cbrt(WoodDeckSF)'
val_train_Xy = val_train_Xy %>%
  mutate('cbrt(WoodDeckSF)' = (WoodDeckSF)^(1/3))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  cbrt(WoodDeckSF)
##  Min.   :0.000   
##  1st Qu.:0.000   
##  Median :0.000   
##  Mean   :2.712   
##  3rd Qu.:5.599   
##  Max.   :9.029
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy[[x]] != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = .1,
  t_binw = .1
)

## NULL

3.29.2 Winsorize

# Winsorize WoodDeckSF (raw and cube-root) on the nonzero subset, compare
# normality, then hard-code both variants as new columns (zeros pass through).
# NOTE(review): `x` is still 'cbrt(WoodDeckSF)' from the preceding chunk.
df = filter(val_train_Xy, WoodDeckSF != 0)

qqnorm(y = df$WoodDeckSF, ylab = 'WoodDeckSF')
qqline(y = df$WoodDeckSF, ylab = 'WoodDeckSF')

qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)

Win_cbrt_x = Winsorize(
  x = df[[x]],
  probs = c(0.01, 0.99),
  na.rm = T
)

qqnorm(y = Win_cbrt_x, ylab = x)
qqline(y = Win_cbrt_x, ylab = x)

Win_raw_x = Winsorize(
  x = df$WoodDeckSF,
  probs = c(0, 0.95),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'WoodDeckSF')
qqline(y = Win_raw_x, ylab = 'WoodDeckSF')

# The cube-root versions have the highest W (closest to normal).
print(shapiro.test(x = df$WoodDeckSF))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$WoodDeckSF
## W = 0.88365, p-value = 1.951e-15
print(shapiro.test(x = df[[x]]))
## 
##  Shapiro-Wilk normality test
## 
## data:  df[[x]]
## W = 0.98712, p-value = 0.003914
print(shapiro.test(x = Win_cbrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_cbrt_x
## W = 0.98648, p-value = 0.002765
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.93225, p-value = 2.343e-11
# Hard-code both Winsorized variants using fixed bounds for reproducibility.
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(cbrt(WoodDeckSF))' = ifelse(
      WoodDeckSF == 0,
      0,
      Winsorize(
        WoodDeckSF^(1/3),
        minval = min(Win_cbrt_x),
        maxval = max(Win_cbrt_x)
      )
    )
  ) %>%
  mutate(
    'Win(WoodDeckSF)' = ifelse(
      WoodDeckSF == 0,
      0,
      Winsorize(
        WoodDeckSF,
        minval = min(Win_raw_x),
        maxval = max(Win_raw_x)
      )
    )
  )

3.29.3 Binarize

# Binarize WoodDeckSF (deck vs no deck) and test the two levels against
# log(SalePrice).
val_train_Xy = val_train_Xy %>%
  mutate('WoodDeck.bin' = ifelse(WoodDeckSF == 0, 0, 1)) %>%
  mutate('WoodDeck.bin.fact' = factor(WoodDeck.bin, ordered = T))

x = 'WoodDeck.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "1"

3.29.4 Correlations

# Compare correlations of all WoodDeckSF variants (raw, cbrt, Winsorized,
# binary) to the other numerics, with and without the zero (no deck) rows.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('WoodDeckSF', 'cbrt(WoodDeckSF)', 'Win(cbrt(WoodDeckSF))',
          'Win(WoodDeckSF)', 'WoodDeck.bin')
x = 'Win(cbrt(WoodDeckSF))'

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
# FIX: df already holds absolute values; the previous summary(abs(df))
# applied abs() twice. Output is identical.
summary(df)
##    WoodDeckSF       cbrt(WoodDeckSF)   Win(cbrt(WoodDeckSF)) Win(WoodDeckSF)   
##  Min.   :0.006145   Min.   :0.004212   Min.   :0.004404      Min.   :0.001863  
##  1st Qu.:0.084695   1st Qu.:0.069548   1st Qu.:0.069704      1st Qu.:0.082133  
##  Median :0.176322   Median :0.161833   Median :0.161822      Median :0.179367  
##  Mean   :0.160453   Mean   :0.165207   Mean   :0.165111      Mean   :0.163279  
##  3rd Qu.:0.234533   3rd Qu.:0.252250   3rd Qu.:0.251680      3rd Qu.:0.242096  
##  Max.   :0.328898   Max.   :0.355420   Max.   :0.355488      Max.   :0.339011  
##   WoodDeck.bin     
##  Min.   :0.002522  
##  1st Qu.:0.055760  
##  Median :0.161167  
##  Mean   :0.149999  
##  3rd Qu.:0.228064  
##  Max.   :0.330345
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

# Same comparison, excluding houses without decks (WoodDeck.bin becomes
# constant there, hence the all-NA column below).
df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
# FIX: same redundant double abs() removed here as well.
summary(df)
##    WoodDeckSF       cbrt(WoodDeckSF)   Win(cbrt(WoodDeckSF)) Win(WoodDeckSF)   
##  Min.   :0.001777   Min.   :0.002974   Min.   :0.004668      Min.   :0.001231  
##  1st Qu.:0.060621   1st Qu.:0.056354   1st Qu.:0.056724      1st Qu.:0.060845  
##  Median :0.112829   Median :0.144985   Median :0.142206      Median :0.122578  
##  Mean   :0.114102   Mean   :0.123596   Mean   :0.124123      Mean   :0.116930  
##  3rd Qu.:0.157044   3rd Qu.:0.172657   3rd Qu.:0.169070      3rd Qu.:0.155611  
##  Max.   :0.285351   Max.   :0.298849   Max.   :0.302455      Max.   :0.304426  
##                                                                                
##   WoodDeck.bin
##  Min.   : NA  
##  1st Qu.: NA  
##  Median : NA  
##  Mean   :NaN  
##  3rd Qu.: NA  
##  Max.   : NA  
##  NA's   :55
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')

y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
  # plot_scat_pairs(
  #   df = filter(val_train_Xy, WoodDeckSF != 0),
  #   x = feat,
  #   y_lst = y_lst
  # )
}

There’s no real correlation between deck square footage and the target price when you remove houses without decks; the binary version does most of the work. But, the transformation does it a little better and does offer more normalized distance between points for KNN.

3.29.5 Hard Code

x = 'Win(cbrt(WoodDeckSF))'

# Record the Winsorization bounds actually used on the training rows so they
# can be hard coded in the engineering pipeline and reproduced on new data.
# NOTE(review): Win_cbrt_x is assumed to be the Winsorized cbrt(WoodDeckSF)
# vector computed in the preceding Winsorize step — confirm against the
# earlier chunk.
min_val = min(Win_cbrt_x)
max_val = max(Win_cbrt_x)
print(paste("min_val:", min_val))
## [1] "min_val: 2.99287415882937"
print(paste("max_val:", max_val))
## [1] "max_val: 8.48252791006478"
# Already hard coded above

# Keep only the winning representation: drop the binary features and the
# Winsorized raw-square-footage variant for WoodDeck.
val_train_Xy = select(
  val_train_Xy,
  -c('WoodDeck.bin', 'WoodDeck.bin.fact', 'Win(WoodDeckSF)')
)

# Sanity-check the distribution of the retained transformed feature.
ggplot(val_train_Xy, aes(x = .data[[x]])) +
  geom_histogram(binwidth = .25)

3.30 OpenPorchSF

Back to top.

3.30.1 Normalize

x = 'OpenPorchSF'
summary(val_train_Xy[x])
##   OpenPorchSF    
##  Min.   :  0.00  
##  1st Qu.:  0.00  
##  Median : 27.00  
##  Mean   : 47.46  
##  3rd Qu.: 72.00  
##  Max.   :547.00
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$OpenPorchSF != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 10,
  t_binw = 0.1
)

## NULL
x = 'cbrt(OpenPorchSF)'
val_train_Xy = val_train_Xy %>%
  mutate('cbrt(OpenPorchSF)' = OpenPorchSF^(1/3))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  cbrt(OpenPorchSF)
##  Min.   :0.000    
##  1st Qu.:0.000    
##  Median :3.000    
##  Mean   :2.276    
##  3rd Qu.:4.160    
##  Max.   :8.178
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy[[x]] != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 0.1,
  t_binw = 0.1
)

## NULL

3.30.2 Winsorize

df = filter(val_train_Xy, OpenPorchSF != 0)
x = 'cbrt(OpenPorchSF)'

qqnorm(y = df$OpenPorchSF, ylab = 'OpenPorchSF')
qqline(y = df$OpenPorchSF, ylab = 'OpenPorchSF')

qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)

Win_cbrt_x = Winsorize(
  x = df[[x]],
  probs = c(0.001, 0.994),
  na.rm = T
)

qqnorm(y = Win_cbrt_x, ylab = x)
qqline(y = Win_cbrt_x, ylab = x)

Win_raw_x = Winsorize(
  x = df$OpenPorchSF,
  probs = c(0, 0.95),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'OpenPorchSF')
qqline(y = Win_raw_x, ylab = 'OpenPorchSF')

print(shapiro.test(x = df$OpenPorchSF))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$OpenPorchSF
## W = 0.79866, p-value < 2.2e-16
print(shapiro.test(x = df[[x]]))
## 
##  Shapiro-Wilk normality test
## 
## data:  df[[x]]
## W = 0.97116, p-value = 5.932e-07
print(shapiro.test(x = Win_cbrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_cbrt_x
## W = 0.97391, p-value = 1.921e-06
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.89182, p-value = 6.251e-16
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(cbrt(OpenPorchSF))' = ifelse(
      OpenPorchSF == 0,
      0,
      Winsorize(
        OpenPorchSF^(1/3),
        # probs = c(0.001, 0.994),
        # na.rm = T
        minval = min(Win_cbrt_x),
        maxval = max(Win_cbrt_x)
      )
    )
  )

Looks like some polymodality happening. Year built doesn’t seem to explain it. I’m not going to manually search for it any further, but a decision tree or other ML algorithm may find and “factor” in the hidden interaction.

ggplot(
  filter(val_train_Xy, OpenPorchSF != 0),
  aes(x = `cbrt(OpenPorchSF)`)
) +
  geom_histogram(binwidth = 0.1) +
  facet_wrap(facets = vars(YearBuilt.fact), ncol = 1)

ggplot(
  filter(val_train_Xy, OpenPorchSF != 0),
  aes(x = `cbrt(OpenPorchSF)`, y = `log(SalePrice)`)
) +
  geom_point(aes(color = YearBuilt))

3.30.3 Binarize

val_train_Xy = val_train_Xy %>%
  mutate('OpenPorch.bin' = ifelse(OpenPorchSF == 0, 0, 1)) %>%
  mutate('OpenPorch.bin.fact' = factor(OpenPorch.bin, ordered = T))

x = 'OpenPorch.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1" "0"

3.30.4 Correlations

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('OpenPorchSF', 'cbrt(OpenPorchSF)', 'Win(cbrt(OpenPorchSF))',
          'OpenPorch.bin')
x = 'Win(cbrt(OpenPorchSF))'

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##   OpenPorchSF       cbrt(OpenPorchSF)  Win(cbrt(OpenPorchSF))
##  Min.   :0.004127   Min.   :0.003166   Min.   :0.002332      
##  1st Qu.:0.059251   1st Qu.:0.090986   1st Qu.:0.091328      
##  Median :0.142723   Median :0.167835   Median :0.168485      
##  Mean   :0.151759   Mean   :0.202395   Mean   :0.202442      
##  3rd Qu.:0.232678   3rd Qu.:0.325809   3rd Qu.:0.326224      
##  Max.   :0.339325   Max.   :0.460438   Max.   :0.459971      
##  OpenPorch.bin     
##  Min.   :0.000127  
##  1st Qu.:0.082138  
##  Median :0.152291  
##  Mean   :0.203538  
##  3rd Qu.:0.356361  
##  Max.   :0.465546
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
##   OpenPorchSF       cbrt(OpenPorchSF)   Win(cbrt(OpenPorchSF)) OpenPorch.bin
##  Min.   :0.002912   Min.   :0.0005588   Min.   :0.001252       Min.   : NA  
##  1st Qu.:0.037026   1st Qu.:0.0371173   1st Qu.:0.033447       1st Qu.: NA  
##  Median :0.070241   Median :0.0705708   Median :0.067662       Median : NA  
##  Mean   :0.074408   Mean   :0.0750998   Mean   :0.073149       Mean   :NaN  
##  3rd Qu.:0.106605   3rd Qu.:0.1177230   3rd Qu.:0.112676       3rd Qu.: NA  
##  Max.   :0.192231   Max.   :0.2019503   Max.   :0.191297       Max.   : NA  
##                                                                NA's   :57
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')

y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
  plot_scat_pairs(
    df = val_train_Xy[val_train_Xy[[x]] != 0, ],
    x = feat,
    y_lst = y_lst
  )
}

In this case, the binary variable is doing all the work.

val_train_Xy = select(
  val_train_Xy,
  -c('cbrt(OpenPorchSF)', 'Win(cbrt(OpenPorchSF))', 'OpenPorch.bin.fact')
)

3.31 EnclosedPorch

Back to top.

3.31.1 Normalize

x = 'EnclosedPorch'
summary(val_train_Xy[x])
##  EnclosedPorch   
##  Min.   :  0.00  
##  1st Qu.:  0.00  
##  Median :  0.00  
##  Mean   : 19.09  
##  3rd Qu.:  0.00  
##  Max.   :294.00
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$EnclosedPorch != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 5,
  t_binw = 5
)

## NULL

3.31.2 Winsorize

df = filter(val_train_Xy, EnclosedPorch != 0)

qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)

Win_raw_x = Winsorize(
  x = df$EnclosedPorch,
  probs = c(0.001, 0.999),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')

print(shapiro.test(x = df[[x]]))
## 
##  Shapiro-Wilk normality test
## 
## data:  df[[x]]
## W = 0.98306, p-value = 0.2587
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.9827, p-value = 0.2437

3.31.3 Binarize

val_train_Xy = val_train_Xy %>%
  mutate('EnclosedPorch.bin' = ifelse(EnclosedPorch == 0, 0, 1)) %>%
  mutate(
    'EnclosedPorch.bin.fact' = factor(
      EnclosedPorch.bin,
      ordered = T
    )
  )

x = 'EnclosedPorch.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "1"

3.31.4 Correlations

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('EnclosedPorch', 'EnclosedPorch.bin')
x = 'EnclosedPorch'

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##  EnclosedPorch      EnclosedPorch.bin 
##  Min.   :0.004006   Min.   :0.001413  
##  1st Qu.:0.025521   1st Qu.:0.043374  
##  Median :0.069574   Median :0.095328  
##  Mean   :0.087443   Mean   :0.113655  
##  3rd Qu.:0.138839   3rd Qu.:0.165249  
##  Max.   :0.371542   Max.   :0.458928
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = c(x),
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
##  EnclosedPorch     
##  Min.   :0.005257  
##  1st Qu.:0.089806  
##  Median :0.169637  
##  Mean   :0.184809  
##  3rd Qu.:0.248067  
##  Max.   :0.474650  
##  NA's   :3
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
  plot_scat_pairs(
    df = val_train_Xy[val_train_Xy[[x]] != 0, ],
    x = feat,
    y_lst = y_lst
  )
}

Houses with enclosed porches are significantly cheaper than those without, maybe due to a confounding variable like year built. There is a weak, if existent, positive correlation between square footage and price within those that have them.

fenced_jbv(
  data = val_train_Xy,
  x = 'EnclosedPorch.bin.fact',
  y = 'log(SalePrice)',
  jit_col = 'YearBuilt',
  jit_alpha = 1
) +
  facet_wrap(facets = vars(YearBuilt.fact))

val_train_Xy$resids = lm(
  `log(SalePrice)` ~ `sqrt(Age)`,
  val_train_Xy
)$residuals

ggplot(val_train_Xy, aes(x = EnclosedPorch, y = resids)) +
  geom_point() +
  geom_smooth(method = 'lm')

val_train_Xy = select(
  val_train_Xy, -c('EnclosedPorch.bin', 'EnclosedPorch.bin.fact')
)

3.32 X3SsnPorch

Back to top.

summary(val_train_Xy$X3SsnPorch)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   4.505   0.000 508.000
val_train_Xy = select(val_train_Xy, -c('X3SsnPorch'))

3.33 ScreenPorch

Back to top.

3.33.1 Normalize

x = 'ScreenPorch'
summary(val_train_Xy[x])
##   ScreenPorch    
##  Min.   :  0.00  
##  1st Qu.:  0.00  
##  Median :  0.00  
##  Mean   : 16.48  
##  3rd Qu.:  0.00  
##  Max.   :480.00
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$ScreenPorch != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 10,
  t_binw = 0.1
)

## NULL
x = 'cbrt(ScreenPorch)'
val_train_Xy = val_train_Xy %>%
  mutate('cbrt(ScreenPorch)' = ScreenPorch^(1/3))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  cbrt(ScreenPorch)
##  Min.   :0.0000   
##  1st Qu.:0.0000   
##  Median :0.0000   
##  Mean   :0.4832   
##  3rd Qu.:0.0000   
##  Max.   :7.8297
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$`cbrt(ScreenPorch)` != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 0.1,
  t_binw = 0.1
)

## NULL

3.33.2 Winsorize

x = 'cbrt(ScreenPorch)'
df = filter(val_train_Xy, ScreenPorch != 0) 

qqnorm(y = df$ScreenPorch, ylab = 'ScreenPorch')
qqline(y = df$ScreenPorch)

qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]])

Win_cbrt_x = Winsorize(
  df[[x]],
  probs = c(0.05, 0.95),
  na.rm = T
)

qqnorm(y = Win_cbrt_x, ylab = 'Win_cbrt_x')
qqline(y = Win_cbrt_x)

Win_raw_x = Winsorize(
  df$ScreenPorch,
  probs = c(0.05, 0.95),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x)

print(shapiro.test(x = df$ScreenPorch))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$ScreenPorch
## W = 0.92914, p-value = 0.001652
print(shapiro.test(x = df[[x]]))
## 
##  Shapiro-Wilk normality test
## 
## data:  df[[x]]
## W = 0.97514, p-value = 0.2487
print(shapiro.test(x = Win_cbrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_cbrt_x
## W = 0.96719, p-value = 0.1008
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.92848, p-value = 0.001547
# Winsorized cube-root feature for ScreenPorch. Zero stays zero (no porch);
# nonzero areas are cube-rooted and clamped to the bounds of Win_cbrt_x,
# which was fit above on the nonzero rows only. Clamping via minval/maxval
# (rather than re-estimating probs) keeps the transformation reproducible
# when applied to validation/test data.
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(cbrt(ScreenPorch))' = ifelse(
      ScreenPorch == 0,
      0,
      Winsorize(
        ScreenPorch^(1/3),
        minval = min(Win_cbrt_x),
        maxval = max(Win_cbrt_x)
      )
    )
  )

3.33.3 Binarize

# Binary indicator for the presence of a screen porch, plus an ordered-factor
# copy of it for the group-wise significance tests below.
val_train_Xy = val_train_Xy %>%
  mutate(
    'ScreenPorch.bin' = ifelse(ScreenPorch == 0, 0, 1),
    'ScreenPorch.bin.fact' = factor(ScreenPorch.bin, ordered = TRUE)
  )

x = 'ScreenPorch.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
# Pairwise significance of log(SalePrice) across ScreenPorch.bin.fact levels;
# only levels with at least min_n observations are compared.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

# Render the p-value matrix as a static annotated grid. Clustering and
# dendrograms are disabled because the level order is meaningful here.
# TRUE/FALSE spelled out: T and F are reassignable bindings.
heatmap.2(
  x = as.matrix(p_vals$pval_df),
  scale = 'none',
  Rowv = FALSE,
  Colv = FALSE,
  dendrogram = 'none',
  cellnote = format(p_vals$pval_df, digits = 2),
  notecex = 0.75,
  notecol = 'black',
  main = paste(y, 'p-values'),
  key = FALSE
)

print(
  paste(
    "Levels w/ significantly different",
    y,
    "than another level:"
  )
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "1"

3.33.4 Correlations

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('ScreenPorch', 'cbrt(ScreenPorch)', 'Win(cbrt(ScreenPorch))',
          'ScreenPorch.bin')
x = 'Win(cbrt(ScreenPorch))'

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##   ScreenPorch       cbrt(ScreenPorch)  Win(cbrt(ScreenPorch))
##  Min.   :0.000127   Min.   :0.001702   Min.   :0.0007076     
##  1st Qu.:0.033205   1st Qu.:0.032575   1st Qu.:0.0310287     
##  Median :0.052721   Median :0.054455   Median :0.0552326     
##  Mean   :0.062430   Mean   :0.060759   Mean   :0.0601016     
##  3rd Qu.:0.074751   3rd Qu.:0.083482   3rd Qu.:0.0840852     
##  Max.   :0.225009   Max.   :0.226688   Max.   :0.2256480     
##  ScreenPorch.bin   
##  Min.   :0.001074  
##  1st Qu.:0.026853  
##  Median :0.056082  
##  Mean   :0.057789  
##  3rd Qu.:0.086152  
##  Max.   :0.220821
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
##   ScreenPorch      cbrt(ScreenPorch)  Win(cbrt(ScreenPorch)) ScreenPorch.bin
##  Min.   :0.01673   Min.   :0.003906   Min.   :0.0138         Min.   : NA    
##  1st Qu.:0.09574   1st Qu.:0.106601   1st Qu.:0.1020         1st Qu.: NA    
##  Median :0.14405   Median :0.143076   Median :0.1361         Median : NA    
##  Mean   :0.14621   Mean   :0.159218   Mean   :0.1533         Mean   :NaN    
##  3rd Qu.:0.20818   3rd Qu.:0.233239   3rd Qu.:0.2118         3rd Qu.: NA    
##  Max.   :0.37341   Max.   :0.325564   Max.   :0.2796         Max.   : NA    
##  NA's   :1         NA's   :1          NA's   :1              NA's   :57
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')

y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
  plot_scat_pairs(
    df = val_train_Xy[val_train_Xy[[x]] != 0, ],
    x = feat,
    y_lst = y_lst
  )
}

The binary doesn’t seem to make as significant a difference as other binaries like WoodDeck and OpenPorch do. So, it’s not worth using alone, but it might be useful in interaction with the area in a linear regression. Decision trees should be able to do without the binary feature. The transformation may help KNN, but as with a decision tree, I would want to avoid overweighting by including both.

3.33.5 “Controlling”

fenced_jbv(
  data = val_train_Xy,
  x = 'ScreenPorch.bin.fact',
  y = 'log(SalePrice)',
  jit_col = 'YearBuilt',
  jit_alpha = 1
) +
  facet_wrap(facets = vars(YearBuilt.fact))

val_train_Xy$resids = lm(
  `log(SalePrice)` ~ `sqrt(Age)`,
  val_train_Xy
)$residuals

ggplot(val_train_Xy, aes(x = `cbrt(ScreenPorch)`, y = resids)) +
  geom_point() +
  geom_smooth(method = 'lm')

print("Correlation of cbrt(ScreenPorch) to residuals of `log(SalePrice)` ~ `sqrt(Age)`")
## [1] "Correlation of cbrt(ScreenPorch) to residuals of `log(SalePrice)` ~ `sqrt(Age)`"
print(cor(x = val_train_Xy$`cbrt(ScreenPorch)`, y = val_train_Xy$resids))
## [1] 0.2266884
# Repeat the residual correlation with 0-area screen porches excluded.
df = filter(val_train_Xy, ScreenPorch != 0)

ggplot(df, aes(x = `cbrt(ScreenPorch)`, y = resids)) +
  geom_point() +
  geom_smooth(method = 'lm')

print("Excluding ScreenPorch 0s:")
# BUG FIX: this previously correlated over the full val_train_Xy, so the
# "no 0s" value silently duplicated the all-rows correlation (0.2266884
# both times in the rendered output). Use the filtered df so rows with
# ScreenPorch == 0 are actually excluded.
print(cor(x = df$`cbrt(ScreenPorch)`, y = df$resids))

This feature doesn’t seem to have much to offer. But, I’ll leave it anyway and let feature selection during modeling suss that out.

val_train_Xy = select(
  val_train_Xy, -c('ScreenPorch.bin.fact', 'Win(cbrt(ScreenPorch))')
)

3.34 PoolArea, QC

Back to top.

summary(val_train_Xy$PoolQC)
## None   Po   Fa   TA   Gd   Ex 
##  712    0    1    0    1    1
val_train_Xy = select(val_train_Xy, -c('PoolArea', 'PoolQC'))

3.35 Fence

Back to top.

x = 'Fence'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "None"  "MnPrv"

3.35.1 Binarize

val_train_Xy = val_train_Xy %>%
  mutate('Fence.bin' = ifelse(Fence == 'None', 0, 1)) %>%
  mutate('Fence.bin.fact' = factor(Fence.bin, ordered = T))

x = 'Fence.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "0" "1"

Having a Fence seems to detract value, probably due to an interaction with another variable as we saw with EnclosedPorch and age. Unlike EnclosedPorch, it’s not immediately obvious what the other variable is, and “controlling” for it with a linear regression may actually increase Fence’s significance. So, rather than hunting for the other variable(s) or dropping this one, I’ll leave it and see if ML modeling can make use of it.

val_train_Xy = select(
  val_train_Xy,
  -c('Fence.bin', 'Fence.bin.fact')
)

3.36 MiscFeature, MiscVal

Back to top.

MiscVal is kind of a cheater variable. It should have precisely 1 for its coefficient. Otherwise, I would just drop it for having so few observations; it might just throw off the regression. If I keep it, it should be transformed in the same way that the target variable is.

It also looks like the presence of a miscellaneous improvement is associated with a lower price if anything.

x = 'MiscFeature'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)

3.37 MiscVal

Back to top.

3.37.1 Normalize

x = 'MiscVal'
summary(val_train_Xy[x])
##     MiscVal        
##  Min.   :    0.00  
##  1st Qu.:    0.00  
##  Median :    0.00  
##  Mean   :   54.19  
##  3rd Qu.:    0.00  
##  Max.   :15500.00
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$MiscVal != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 200,
  t_binw = .1
)

## NULL
x = 'log2(MiscVal)'
val_train_Xy = val_train_Xy %>%
  mutate(
    'log2(MiscVal)' = ifelse(
      MiscVal == 0,
      0,
      log2(MiscVal)
    )
  )

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  log2(MiscVal)    
##  Min.   : 0.0000  
##  1st Qu.: 0.0000  
##  Median : 0.0000  
##  Mean   : 0.2708  
##  3rd Qu.: 0.0000  
##  Max.   :13.9200
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$`log2(MiscVal)` != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = .1,
  t_binw = .1
)

## NULL

Alright, I’ll give it a natural log like SalePrice, since it is a straight dollar value.

x = 'log(MiscVal)'
val_train_Xy = val_train_Xy %>%
  mutate(
    'log(MiscVal)' = ifelse(
      MiscVal == 0,
      0,
      log(MiscVal)
    )
  )

df = filter(val_train_Xy, MiscVal != 0)

gg = ggplot(df, aes(x = `log(MiscVal)`))
p1 = gg + geom_histogram(binwidth = 0.1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

3.37.2 Winsorize

Since this variable is an actual dollar value, Winsorizing it doesn’t really make sense. I’ll check it out anyway.

qqnorm(y = df$MiscVal, ylab = 'MiscVal')
qqline(y = df$MiscVal)

qqnorm(y = df$`log(MiscVal)`, ylab = 'log(MiscVal)')
qqline(y = df$`log(MiscVal)`)

Win_log_x = Winsorize(
  df$`log(MiscVal)`,
  probs = c(0.007, 0.95),
  na.rm = T
)

qqnorm(y = Win_log_x, ylab = 'Win_log_x')
qqline(y = Win_log_x)

Win_raw_x = Winsorize(
  df$MiscVal,
  probs = c(0, 0.95)
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x)

print(shapiro.test(x = df$MiscVal))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$MiscVal
## W = 0.49238, p-value = 2.714e-07
print(shapiro.test(x = df$`log(MiscVal)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$`log(MiscVal)`
## W = 0.88923, p-value = 0.02603
print(shapiro.test(x = Win_log_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_log_x
## W = 0.89412, p-value = 0.03203
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.5586, p-value = 1.129e-06
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(log(MiscVal))' = ifelse(
      MiscVal == 0,
      0,
      Winsorize(
        log(MiscVal),
        minval = min(Win_log_x),
        maxval = max(Win_log_x)
      )
    )
  )

3.37.3 Binarize

val_train_Xy = val_train_Xy %>%
  mutate('MiscVal.bin' = ifelse(MiscVal == 0, 0, 1)) %>%
  mutate('MiscVal.bin.fact' = factor(MiscVal.bin, ordered = T))

x = 'MiscVal.bin.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)

3.37.4 Correlations

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('MiscVal', 'log2(MiscVal)', 'log(MiscVal)',
          'Win(log(MiscVal))', 'MiscVal.bin')

x = 'log(MiscVal)'

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##     MiscVal         log2(MiscVal)       log(MiscVal)      Win(log(MiscVal)) 
##  Min.   :0.001150   Min.   :0.001803   Min.   :0.001803   Min.   :0.002117  
##  1st Qu.:0.006437   1st Qu.:0.013649   1st Qu.:0.013649   1st Qu.:0.013673  
##  Median :0.013835   Median :0.028586   Median :0.028586   Median :0.028980  
##  Mean   :0.018486   Mean   :0.032804   Mean   :0.032804   Mean   :0.032855  
##  3rd Qu.:0.025099   3rd Qu.:0.047086   3rd Qu.:0.047086   3rd Qu.:0.046462  
##  Max.   :0.083097   Max.   :0.087143   Max.   :0.087143   Max.   :0.086933  
##   MiscVal.bin     
##  Min.   :0.00218  
##  1st Qu.:0.01782  
##  Median :0.03383  
##  Mean   :0.03774  
##  3rd Qu.:0.05123  
##  Max.   :0.09574
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
##     MiscVal         log2(MiscVal)       log(MiscVal)      Win(log(MiscVal)) 
##  Min.   :0.006626   Min.   :0.009576   Min.   :0.009576   Min.   :0.001031  
##  1st Qu.:0.050062   1st Qu.:0.121361   1st Qu.:0.121361   1st Qu.:0.114223  
##  Median :0.123861   Median :0.233112   Median :0.233112   Median :0.264792  
##  Mean   :0.139666   Mean   :0.254203   Mean   :0.254203   Mean   :0.270993  
##  3rd Qu.:0.190636   3rd Qu.:0.369457   3rd Qu.:0.369457   3rd Qu.:0.415090  
##  Max.   :0.458312   Max.   :0.590246   Max.   :0.590246   Max.   :0.610388  
##                                                                             
##   MiscVal.bin 
##  Min.   : NA  
##  1st Qu.: NA  
##  Median : NA  
##  Mean   :NaN  
##  3rd Qu.: NA  
##  Max.   : NA  
##  NA's   :58
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')

y_lst = c('Win(log(SalePrice))')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
  plot_scat_pairs(
    df = val_train_Xy[val_train_Xy[[x]] != 0, ],
    x = feat,
    y_lst = y_lst
  )
}

3.37.5 “Controlling”

MiscVal appears to be correlated with the size of the lot and house. Perhaps that will do the heavy lifting and make MiscVal obsolete.

val_train_Xy$resids = lm(
  `log(SalePrice)` ~ `Win(LotArea)`,
  val_train_Xy
)$residuals

print("Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `Win(LotArea)`")
## [1] "Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `Win(LotArea)`"
print(cor(x = val_train_Xy$`log(MiscVal)`, y = val_train_Xy$resids))
## [1] -0.08294709
ggplot(val_train_Xy, aes(x = `log(MiscVal)`, y = resids)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  ylab(label = "`log(SalePrice)` ~ `Win(LotArea)`")

# Repeat the residual correlation with MiscVal == 0 rows excluded.
df = filter(val_train_Xy, MiscVal != 0)

print("Excluding MiscVal 0s:")  # typo fix: was "Exluding"
## [1] "Excluding MiscVal 0s:"
print(cor(x = df$`log(MiscVal)`, y = df$resids))
## [1] 0.4032275
ggplot(df, aes(x = `log(MiscVal)`, y = resids)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  ylab(label = "`log(SalePrice)` ~ `Win(LotArea)`")

val_train_Xy$resids = lm(
  `log(SalePrice)` ~ `log10(log10(LotArea))`,
  val_train_Xy
)$residuals

print("Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `log10(log10(LotArea))`")
## [1] "Correlation of log(MiscVal) to residuals of `log(SalePrice)` ~ `log10(log10(LotArea))`"
print(cor(x = val_train_Xy$`log(MiscVal)`, y = val_train_Xy$resids))
## [1] -0.0788646
ggplot(val_train_Xy, aes(x = `log(MiscVal)`, y = resids)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  ylab(label = "`log(SalePrice)` ~ `log10(log10(LotArea))`")

# Same exclusion check against the residuals of the double-log LotArea model.
df = filter(val_train_Xy, MiscVal != 0)

print("Excluding MiscVal 0s:")  # typo fix: was "Exluding"
## [1] "Excluding MiscVal 0s:"
print(cor(x = df$`log(MiscVal)`, y = df$resids))
## [1] 0.4945909
ggplot(df, aes(x = `log(MiscVal)`, y = resids)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  ylab(label = "`log(SalePrice)` ~ `log10(log10(LotArea))`")

After factoring in Lot Area, there’s still some correlation between MiscVal and the target once 0s are removed. It may still be worth keeping. We’ll let modeling suss that out.

# Drop the experimental MiscVal variants we decided not to carry forward.
val_train_Xy = val_train_Xy %>%
  select(-c('MiscVal.bin', 'MiscVal.bin.fact', 'log2(MiscVal)'))

3.38 MoSold, YrSold

Back to top.

MoSold somewhat normally distributed around June/July. Conventional wisdom says that summer sales are higher in volume and price, but the data don’t bear that out for price.

Records go from January 2006 through July 2010, so August-December are underrepresented by roughly 20%.

YrSold pretty uniform (about 140-160) except for 2010 which ended in July in this set.

Interesting spike in sales in Spring 2010. Foreclosures coming onto market?

3.38.1 Set to Factors: MoSold, YrSold

# Encode the sale month as an unordered factor and the sale year as an
# ordered factor (years have a natural order; months cycle).
val_train_Xy = val_train_Xy %>%
  mutate(
    'MoSold.fact' = factor(MoSold),
    'YrSold.fact' = factor(YrSold, ordered = T)
  )

# Summarize raw and log SalePrice by sale month.
x = 'MoSold.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
# Pairwise significance of log(SalePrice) differences between sale months;
# min_n = 30 presumably excludes sparsely-populated levels -- see
# get_signif_levels in tools/eda.R.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

# Static heatmap of the p-value matrix (no clustering/dendrograms), with the
# p-values printed in each cell.
heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1"  "7"  "5"  "9"  "12"
# Repeat the per-level summaries and pairwise significance test for sale year.
x = 'YrSold.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

# Heatmap of pairwise p-values across years, same layout as the MoSold one.
heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
# No year differs significantly from another -- YrSold carries little signal.
print(p_vals$signif_levels)
## character(0)

3.39 SoldDate

Back to top.

# Build a synthetic mid-month Date (the 15th) from YrSold/MoSold so sales can
# be plotted on a continuous time axis.
val_train_Xy = val_train_Xy %>%
  mutate(
    SoldDate = as.Date(
      sprintf('%s/%s/15', YrSold, MoSold),
      format = '%Y/%m/%d'
    )
  )

# Sales volume over time.
ggplot(val_train_Xy, aes(x = SoldDate)) +
  geom_bar()

# Summarize raw and log SalePrice by sale date.
x = 'SoldDate'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
# Refresh the numeric-feature list (columns have been added/dropped above).
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('SoldDate')
# 
# df = get_cors(
#   data = filter(
#     select(val_train_Xy, all_of(num_feats)),
#     !is.na(.data[[x]])
#   ),
#   x_lst = x_lst,
#   feats = num_feats
# )
# df
# print("Summary of absolute values of Pearson's Rs:")
# df = abs(df)
# summary(abs(df))
# 
# df = melt(df)
# ggplot(df, aes(x = variable, y = value)) +
#   geom_boxplot(notch = T) +
#   ylab(label = 'Absolute Value of Correlation to Other Features')

# Scatter of SoldDate against the log target.
y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)

## NULL
# Price over time, faceted by sale condition and sale type, with both a loess
# (default) and a linear (yellow) trend for comparison.
ggplot(
  val_train_Xy,
  aes(x = SoldDate, y = `log(SalePrice)`)
) +
  geom_point() +
  geom_smooth() +
  geom_smooth(method = 'lm', color = 'Yellow') +
  facet_grid(rows = vars(SaleCondition), cols = vars(SaleType))

None of the date features seems to matter in this set. Drop it, but not yet.

3.40 SaleType

Back to top.

# Summarize raw and log SalePrice by sale type.
x = 'SaleType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
# Pairwise significance of log(SalePrice) differences between sale types
# (levels with fewer than min_n observations presumably excluded).
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

# Heatmap of the pairwise p-value matrix, values printed in each cell.
heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "WD"  "New"

# Focus on the three common sale types for the volume/time breakdowns.
df = val_train_Xy[val_train_Xy$SaleType %in% c('WD', 'New', 'COD'), ]
gg = ggplot(df, aes(x = SaleType))

# Counts by month (rows) and year (columns).
gg + geom_bar() +
  facet_grid(rows = vars(MoSold), cols = vars(YrSold)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Counts by sale type, year side-by-side.
gg + geom_bar(position = 'dodge', aes(fill = factor(YrSold)))

# Volume over time, one line per sale type.
ggplot(df, aes(color = SaleType, x = SoldDate)) +
  geom_freqpoly()

# ggplot(df, aes(x = SaleType, y = SalePrice)) +
#   geom_col(position = 'dodge', aes(fill = factor(YrSold), stat = 'mean'))

# Price distributions per sale type, split by year.
ggplot(df, aes(x = SaleType, y = `log(SalePrice)`)) +
  geom_boxplot(position = 'dodge', notch = T, aes(color = factor(YrSold)))

You can see new home sales drop off with the crash. Court officer deeds also ticked up, possibly due to more foreclosures. But, there didn’t seem to be a significant difference in sale prices year over year within sale type groups, except between new sales in 2007 and 2010, the latter of which is not fully represented in this set.

3.41 SaleCondition

Back to top.

592 normal, 61 partial (home not completed when last assessed), 43 abnormal (trade, foreclosure, short sale), 11 family. I’m guessing a family sale will typically be lower in price, as will foreclosures and short sales. I’m not sure what to make of partials; the house wasn’t fully built when assessed, so the price may be askew?

Surprisingly, abnormal sales didn’t seem to vary with the crash. Ames appears to have fared well.

# Summarize raw and log SalePrice by sale condition.
x = 'SaleCondition'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
# Pairwise significance of log(SalePrice) differences between sale conditions.
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

# Heatmap of the pairwise p-value matrix, values printed in each cell.
heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Normal"  "Abnorml" "Partial"
# Focus on the three conditions flagged as significant above.
df = filter(val_train_Xy, SaleCondition %in% c('Normal', 'Abnorml', 'Partial'))

gg = ggplot(df, aes(x = SaleCondition))

# Counts by month (rows) and year (columns).
gg + geom_bar() +
  facet_grid(rows = vars(MoSold), cols = vars(YrSold)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Counts by condition, year side-by-side.
gg + geom_bar(aes(fill = factor(YrSold)), position = 'dodge')

# Volume over time, one line per condition.
ggplot(df, aes(x = SoldDate, color = SaleCondition)) +
  geom_freqpoly()

# Price distributions per condition, split by year.
ggplot(df, aes(x = SaleCondition, y = `log(SalePrice)`)) +
  geom_boxplot(position = 'dodge', notch = T, aes(color = factor(YrSold)))

Bearing in mind that 2010 is not a complete year in this set, partial sales dropped as normal sales increased. This trend may be explained by developers finishing/halting their projects. Filtering/grouping by year built and/or neighborhood might help check this, but I’ll skip it for the sake of finishing.

Fall and winter months seemed to be where the bulk of these increases in normal sales fell each year.

3.42 Overall Correlations

Back to top.

To wrap up this notebook, and as a preliminary gauge on how well I have prepared the data for ML, mainly for linear regression, I’ll check out how correlations have changed. Have my variables increased in correlation in general? Have they increased in correlation to the target variable?

Increased correlations to the target variable have obvious benefits. Increased correlations between predictor variables will help clarify which variables may overlap in their predictive power and redundantly overweight the same underlying information.

I made a lot of new features, dropping some along the way. In cases where a Winsorized version seemed a good option, I also kept the scale-transformed version where applicable. For example, I kept both log(SalePrice) and Win(log(SalePrice)). Winsorization will help a straightforward linear regression without interactions. But, Winsorization will undercut KNN’s ability to cluster multivariate outliers and similarly RF’s ability to group by extremes. That said, Winsorization will reduce the chances of overfit in all three. All that is to say that there are redundant new features.

I can’t think of a quick and dirty way to view this without getting skewed results or busy heatmaps, other than to make a table of all the variables’ correlations to the target variable, which isn’t very easily digested either. The slow and tedious way would be to iteratively “manually” (to some extent) select like variables for correlation within their experimental group. I’m not going to do that.

# Select the numeric columns, ordered so each raw feature sits next to its
# transformed/Winsorized variants for easier visual comparison.
# Fixed: the original vector listed "Win(BedroomAbvGr)" and "TotRmsAbvGrd"
# twice; dplyr::select() silently deduplicates, but the repeats were noise.
val_train_Xy_numeric = select(
  val_train_Xy, # Reorder for easier comparison.
  c('SalePrice', 'log(SalePrice)', 'Win(log(SalePrice))', "LotFrontage",
    "log10(LotFrontage)", "Win(log10(LotFrontage))", "Win(LotFrontage)",
    "LotArea", "log10(log10(LotArea))", "Win(LotArea)", "OverallQual_int",
    "OverallCond_int", "YearBuilt", "sqrt(Age)", "YearRemodAdd", "MasVnrArea",
    "cbrt(MasVnrArea)", "Win(cbrt(MasVnrArea))", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "cbrt(BsmtUnfSF)", "square(log(TotalBsmtSF))",
    "Win(square(log(TotalBsmtSF)))", "TotalBsmtSF", "TotalBsmtFinSF",
    "sqrt(TotalBsmtFinSF)", "Win(sqrt(TotalBsmtFinSF))", "X2ndFlrSF",
    "X2ndFlr.bin", "GrLivArea", "square(log2(GrLivArea))",
    "Win(square(log2(GrLivArea)))", "TotBaths", "Win(TotBaths)",
    "BedroomAbvGr", "Win(BedroomAbvGr)", "TotRmsAbvGrd", "Win(TotRmsAbvGrd)",
    "Fireplaces.bin", "GarageCars",
    "Win(GarageCars)", "WoodDeckSF", "cbrt(WoodDeckSF)",
    "Win(cbrt(WoodDeckSF))", "OpenPorchSF", "OpenPorch.bin", "EnclosedPorch",
    "ScreenPorch", "cbrt(ScreenPorch)", "ScreenPorch.bin", "MiscVal",
    "log(MiscVal)", "Win(log(MiscVal))", "MoSold", "YrSold")
)

# Quick overview heatmap of all pairwise correlations.
ggcorr(val_train_Xy_numeric)

# Pairwise-complete correlations tolerate the NAs left in some features.
cor_mtx = cor(val_train_Xy_numeric, use = 'pairwise.complete.obs')

target_vars_vec = c('SalePrice', 'log(SalePrice)', 'Win(log(SalePrice))')

# Keep only correlations of predictors (Var2) to the target variants (Var1).
cor_mtx_melted = melt(cor_mtx)
sales_cor_mtx_melted = filter(
  cor_mtx_melted,
  Var1 %in% target_vars_vec & !(Var2 %in% target_vars_vec)
)

# Tile map: one column per target variant, one row per predictor.
ggplot(sales_cor_mtx_melted, aes(x = Var1, y = Var2)) +
  geom_tile(aes(fill = value))

# Wide table of the same correlations, then a fenced jittered-box view.
dcast(sales_cor_mtx_melted, formula = Var2 ~ Var1)
fenced_jbv(
  data = sales_cor_mtx_melted,
  x = 'Var1',
  y = 'value',
  jit_h = 0
)

4 Serialize Dataframe for Storage

Back to top.

I’ll write it to an RDS file so I can verify that my final engineering script duplicates this process. I’ll verify in the next notebook before I start modeling.

# Restore the Id column (removed at load time), then serialize the engineered
# training set and read it back as a sanity check.
out_path = 'data/eda_val_train_Xy.rds'
val_train_Xy$Id = id_col

saveRDS(val_train_Xy, out_path)
head(readRDS(out_path))